library(xgboost)       # gradient-boosted tree models
library(randomForest)  # random forest models
library(tidyverse)     # dplyr / tidyr / ggplot2 pipelines
library(lubridate)     # date parsing (ymd)
source('functions.r')  # project helpers used below: rmse(), fit_xgboost(), fit_svm() -- TODO confirm contents
load("Table_construction.Rdata")  # provides features, data_before, features_on, outcomes (used below)
# Assemble the modeling table: join demographics, COMPAS sub-features and
# outcomes onto `features`, then derive capped counts, race indicators,
# subscales and row filters.
features = features %>%
  # Add other useful information (race/sex/name live in the nested `people` table):
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Recidivism_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Recidivism_decile_score`, # Adjust for violent/general
    # Cap count features at 5:
    p_jail30 = pmin(p_jail30,5),
    # BUG FIX: was pmin(p_jail30,5), which made p_prison30 a copy of the capped
    # p_jail30 (and perfectly collinear with it -- see the NA coefficient in the
    # Group 5 lm output below).
    p_prison30 = pmin(p_prison30,5),
    p_prison = pmin(p_prison,5),
    p_probation = pmin(p_probation,5),
    # Race indicators; race == "Other" is the baseline:
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0),
    # Subscales:
    crim_inv = p_charge +
      p_jail30 +
      p_prison +
      p_probation,
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two conditions were separated by a comma, which created an
    # unnamed column and left filt1 checking only the general decile score.
    filt1 = `Risk of Recidivism_decile_score` != -1 & `Risk of Violence_decile_score` != -1, # Filter 1
    filt3 = !is.na(current_offense_date), # Filter 3
    filt4 = current_offense_date <= current_offense_date_limit, # Filter 4
    filt5 = p_current_age > 18 & p_current_age <= 70 # Filter 5
  )
# Fit the age lower bound f(age): keep rows with valid scores (filt1) and
# ages in (18, 70] (filt5), then find the smallest raw score at each age.
features_f_age = features %>%
  filter(filt1,filt5) %>%
  select(p_current_age, raw_score)
# One row per age (more on ties): the observation(s) with the smallest raw score.
lb_age = features_f_age %>%
  group_by(p_current_age) %>%
  #arrange(raw_score, .by_group=TRUE) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value (n=-1 keeps the bottom 1, ties included)
# Quadratic polynomial in age through the per-age minima.
mdl_age = lm(raw_score ~
  I(p_current_age^2) +
  p_current_age,
  data=lb_age)
# More precision for paper
summary(mdl_age)
##
## Call:
## lm(formula = raw_score ~ I(p_current_age^2) + p_current_age,
## data = lb_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37713 -0.01170 0.01189 0.03591 0.12386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0835937 0.0670948 -1.246 0.216
## I(p_current_age^2) 0.0004320 0.0000364 11.868 <2e-16 ***
## p_current_age -0.0726088 0.0032499 -22.342 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06507 on 98 degrees of freedom
## Multiple R-squared: 0.9819, Adjusted R-squared: 0.9816
## F-statistic: 2663 on 2 and 98 DF, p-value: < 2.2e-16
# Report the fitted coefficients at full precision for the paper.
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "-8.35936926784002359847e-02" "4.32026931047234277576e-04"
## [3] "-7.26087894972571729069e-02"
## Add f(age) to features: evaluate the fitted lower bound at each person's
## age, subtract it from the raw score, and flag who lies on/above the bound.
features = features %>%
  mutate(f_age = predict(mdl_age, newdata = data.frame(p_current_age = p_current_age))) %>%
  mutate(
    raw_score__f_age = raw_score - f_age,  # score relative to the age lower bound
    filt6 = raw_score >= f_age             # TRUE when the score is on/above f(age)
  )
## Age polynomial plot: raw scores by age with the fitted lower-bound curve overlaid.
xmin = 18
xmax = 70
age_grid = seq(xmin, xmax, length.out = 1000)
lb_curve = predict(mdl_age, newdata = data.frame(p_current_age = age_grid))
ggplot() +
  geom_point(aes(x = p_current_age, y = raw_score),
             color = "#619CFF", alpha = .3, data = features_f_age) +
  geom_line(aes(x = age_grid, y = lb_curve), color = "#F8766D") +
  theme_bw() +
  xlim(xmin, xmax) +
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(text = element_text(size = 12),
        axis.text = element_text(size = 12),
        legend.position = "none")
ggsave("Figures/age_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
# Criminal-involvement subscale vs score residual, colored by above/below f(age).
crim_inv_df = features %>%
  filter(filt1, filt3) %>%
  select(crim_inv, raw_score__f_age, filt6)
ggplot(crim_inv_df) +
  geom_point(aes(x = crim_inv, y = raw_score__f_age, color = filt6), alpha = .5) +
  xlim(c(0, 70)) +
  theme_bw() +
  xlab("Sum of Criminal Involvement Components") +
  ylab(expression(General~score~-~f[age])) +
  theme(text = element_text(size = 12),
        axis.text = element_text(size = 12),
        legend.position = "top") +
  scale_color_manual(name = element_blank(),
                     breaks = c("TRUE", "FALSE"),
                     labels = c(expression(Above~f[age]), expression(Below~f[age])),
                     values = c("TRUE" = "#619CFF", "FALSE" = "#00BA38"))
## Warning: Removed 10 rows containing missing values (geom_point).
ggsave("Figures/crim_inv_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 10 rows containing missing values (geom_point).
We also look at just the number of prior charges.
# Number of prior charges vs score residual, colored by above/below f(age).
priors_df = features %>%
  filter(filt1, filt3) %>%
  select(p_charge, raw_score__f_age, filt6)
ggplot(priors_df) +
  geom_point(aes(x = p_charge, y = raw_score__f_age, color = filt6), alpha = .5) +
  xlim(c(0, 70)) +
  theme_bw() +
  xlab("Number of prior charges") +
  ylab(expression(General~score~-~f[age])) +
  theme(text = element_text(size = 12),
        axis.text = element_text(size = 12),
        legend.position = "top") +
  scale_color_manual(name = element_blank(),
                     breaks = c("TRUE", "FALSE"),
                     labels = c(expression(Above~f[age]), expression(Below~f[age])),
                     values = c("TRUE" = "#619CFF", "FALSE" = "#00BA38"))
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
There are a few groups of predictors we will use:

* Group 1: without using age variables or race
* Group 2: without using age variables but with race
* Group 3: without using race but with age variables
* Group 4: using age variables and race
* Group 5: using age variables, race, and additional count features (p_arrest, p_prison30)
#### Generic stuff (applies to all models)
## Filter rows: valid decile scores (filt1) and a known current offense date (filt3).
features_filt = features %>%
  filter(filt1, filt3)
## Set parameters (each combination will be run)
# xgboost grid: 2*2*2*2 = 16 combinations, searched by fit_xgboost().
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# newer xgboost releases; kept as-is to match the recorded outputs.
param <- list(objective = "reg:linear",
  eval_metric = "rmse",
  eta = c(.05,.1),
  gamma = c(.5, 1),
  max_depth = c(2,5),
  min_child_weight = c(5,10),
  subsample = c(1),
  colsample_bytree = c(1)
)
# svm grid: 3*3*3 = 27 combinations, searched by fit_svm().
# gamma_scale presumably multiplies the default gamma -- confirm in functions.r.
param_svm = list(
  type = 'eps-regression',
  cost = c(0.5,1,2),
  epsilon = c(0.5,1,1.5),
  gamma_scale = c(0.5,1,2)
)
# Results table: training RMSE per predictor group (rows) and model family (columns).
res_rmse = data.frame(Group = 1:5, lm = NA, xgb = NA, rf = NA, svm = NA)
### Create group 1 training data (no age variables, no race)
## Select features and round count features
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__f_age)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score__f_age))
y = as.matrix(select(train, raw_score__f_age))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
## Baseline linear model on the same features.
mdl_lm = lm(raw_score__f_age ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.66318 -0.45222 -0.06688 0.37365 2.54723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0928535 0.0177452 61.586 <2e-16 ***
## p_age_first_offense -0.0071197 0.0005582 -12.754 <2e-16 ***
## p_charge 0.0255577 0.0010355 24.682 <2e-16 ***
## p_jail30 -0.0063157 0.0404885 -0.156 0.876
## p_prison 0.2072928 0.0085123 24.352 <2e-16 ***
## p_probation 0.1162716 0.0074759 15.553 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.576 on 9036 degrees of freedom
## Multiple R-squared: 0.4043, Adjusted R-squared: 0.4039
## F-statistic: 1226 on 5 and 9036 DF, p-value: < 2.2e-16
# Record in-sample RMSE of the linear model for group 1.
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(923) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 15
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "1"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions vs the target, with the identity line.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784) # random forest reproducibility
mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)
# NOTE(review): randomForest's $predicted is the out-of-bag prediction, so this
# RMSE is OOB rather than in-sample like the other model families -- confirm intent.
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 19
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.3333333"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
# Clean up everything this section created before the next group.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 2 training data (race indicators, still no age variables)
## Select features and round count features
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__f_age)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score__f_age))
y = as.matrix(select(train, raw_score__f_age))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
## Baseline linear model on the same features.
mdl_lm = lm(raw_score__f_age ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.48683 -0.43624 -0.06184 0.36368 2.39440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7692314 0.0298043 25.809 < 2e-16 ***
## p_age_first_offense -0.0051199 0.0005678 -9.017 < 2e-16 ***
## p_charge 0.0246614 0.0010185 24.214 < 2e-16 ***
## p_jail30 0.0077075 0.0397802 0.194 0.84638
## p_prison 0.1972350 0.0083913 23.505 < 2e-16 ***
## p_probation 0.1130858 0.0073464 15.393 < 2e-16 ***
## race_black 0.3713524 0.0257228 14.437 < 2e-16 ***
## race_white 0.2452000 0.0259959 9.432 < 2e-16 ***
## race_hispanic 0.0869787 0.0311562 2.792 0.00525 **
## race_asian 0.0914559 0.0859477 1.064 0.28732
## race_native 0.2581171 0.1097063 2.353 0.01865 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5657 on 9031 degrees of freedom
## Multiple R-squared: 0.4258, Adjusted R-squared: 0.4251
## F-statistic: 669.6 on 10 and 9031 DF, p-value: < 2.2e-16
# Record in-sample RMSE of the linear model for group 2.
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(480) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions vs the target, with the identity line.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
# Same predictions against the untransformed COMPAS raw score.
ggplot(data.frame(xgboost = pred, compas = features_filt$raw_score),
       aes(x = xgboost, y = compas)) +
  geom_point(alpha = .3) +
  theme_bw() +
  xlab("XGBoost prediction") +
  ylab("COMPAS raw score") +
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(6778) # random forest reproducibility
mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)
# NOTE(review): randomForest's $predicted is out-of-bag -- confirm intent.
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.09090909"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
# FIX: also remove mdl_svm (was omitted here but removed in the group 1 cleanup).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 3 training data (age variables, no race)
## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__f_age)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score__f_age))
y = as.matrix(select(train, raw_score__f_age))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
## Baseline linear model on the same features.
mdl_lm = lm(raw_score__f_age ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27350 -0.44879 -0.06964 0.37017 2.55806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.054042 0.018181 57.975 <2e-16 ***
## p_current_age 0.010905 0.001207 9.038 <2e-16 ***
## p_age_first_offense -0.017448 0.001271 -13.730 <2e-16 ***
## p_charge 0.022968 0.001070 21.465 <2e-16 ***
## p_jail30 0.020513 0.040418 0.508 0.612
## p_prison 0.184563 0.008840 20.878 <2e-16 ***
## p_probation 0.096116 0.007770 12.371 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5735 on 9035 degrees of freedom
## Multiple R-squared: 0.4096, Adjusted R-squared: 0.4092
## F-statistic: 1045 on 6 and 9035 DF, p-value: < 2.2e-16
# Record in-sample RMSE of the linear model for group 3.
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(999) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions vs the target, with the identity line.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
ggsave("Figures/raw-fage_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(5) # random forest reproducibility
mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)
# NOTE(review): randomForest's $predicted is out-of-bag -- confirm intent.
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.1428571"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
# FIX: also remove mdl_svm (was omitted here but removed in the group 1 cleanup).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 4 training data
## Select features and round count features (group 4: age variables AND race)
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__f_age)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score__f_age))
y = as.matrix(select(train, raw_score__f_age))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
## Baseline linear model on the same features.
mdl_lm = lm(raw_score__f_age ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.09545 -0.43035 -0.06297 0.35908 2.40230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.730549 0.029972 24.374 < 2e-16 ***
## p_current_age 0.010817 0.001188 9.107 < 2e-16 ***
## p_age_first_offense -0.015319 0.001254 -12.212 < 2e-16 ***
## p_charge 0.022088 0.001053 20.986 < 2e-16 ***
## p_jail30 0.034671 0.039712 0.873 0.38265
## p_prison 0.174487 0.008719 20.012 < 2e-16 ***
## p_probation 0.093178 0.007633 12.207 < 2e-16 ***
## race_black 0.371997 0.025607 14.527 < 2e-16 ***
## race_white 0.239606 0.025886 9.256 < 2e-16 ***
## race_hispanic 0.093341 0.031024 3.009 0.00263 **
## race_asian 0.100055 0.085566 1.169 0.24230
## race_native 0.247029 0.109219 2.262 0.02373 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5632 on 9030 degrees of freedom
## Multiple R-squared: 0.431, Adjusted R-squared: 0.4303
## F-statistic: 621.8 on 11 and 9030 DF, p-value: < 2.2e-16
# Record in-sample RMSE of the linear model for group 4.
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(23) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions vs the target, with the identity line.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(text = element_text(size = 12),
        axis.text = element_text(size = 12))
ggsave("Figures/raw-fage_xgboost_gp4_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
# Individuals highlighted in Table 5 of the paper, identified by
# (person_id, screening_date).
highlight = data.frame(
  person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
  screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
  highlight = TRUE
)
# Attach the group-4 xgboost predictions and flag which rows are Table 5 cases;
# the left_join leaves NA for non-highlighted rows, converted to FALSE below.
df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
  left_join(highlight, by = c("person_id","screening_date")) %>%
  mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
  mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
# Which highlighted individuals get their name label placed in which direction
# (hand-tuned to reduce label overlap in the figure).
person_id_text_topright = c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(1394, 1497)
person_id_text_botright = c(11312, 6886, 8491, 10774)
person_id_text_botleft = c(799)
# Scatter of prediction vs raw score; Table 5 cases drawn opaque and labeled.
ggplot() +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight), alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight), data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) +
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) +
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) +
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) +
  xlab(expression(XGBoost~pred.~of~general~score~-~f[age])) +
  ylab("General score")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank()) +
  xlim(0.2,3.5)
ggsave("Figures/xgboost_rawScore_annotate_general.pdf",width = 4, height = 4, units = "in")
set.seed(3720) # random forest reproducibility
mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)
# NOTE(review): randomForest's $predicted is out-of-bag -- confirm intent.
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.08333333"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
# FIX: also remove mdl_svm (was omitted here but removed in the group 1 cleanup).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 5 training data (age, race, plus p_arrest and p_prison30)
## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_arrest,
    p_jail30,
    p_prison30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__f_age)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score__f_age))
y = as.matrix(select(train, raw_score__f_age))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
## Baseline linear model on the same features.
## NOTE(review): the recorded summary shows an NA coefficient for p_prison30
## (perfect collinearity) -- see how p_prison30 is constructed upstream.
mdl_lm = lm(raw_score__f_age ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96855 -0.43265 -0.06245 0.36092 2.40755
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.733474 0.029954 24.486 < 2e-16 ***
## p_current_age 0.011331 0.001193 9.497 < 2e-16 ***
## p_age_first_offense -0.015864 0.001260 -12.587 < 2e-16 ***
## p_charge 0.014385 0.002145 6.707 2.1e-11 ***
## p_arrest 0.006756 0.001639 4.121 3.8e-05 ***
## p_jail30 0.023419 0.039770 0.589 0.55597
## p_prison30 NA NA NA NA
## p_prison 0.170400 0.008768 19.435 < 2e-16 ***
## p_probation 0.081193 0.008162 9.948 < 2e-16 ***
## race_black 0.373132 0.025586 14.584 < 2e-16 ***
## race_white 0.241091 0.025866 9.321 < 2e-16 ***
## race_hispanic 0.095503 0.031001 3.081 0.00207 **
## race_asian 0.100802 0.085490 1.179 0.23839
## race_native 0.242206 0.109129 2.219 0.02648 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5627 on 9029 degrees of freedom
## Multiple R-squared: 0.432, Adjusted R-squared: 0.4313
## F-statistic: 572.4 on 12 and 9029 DF, p-value: < 2.2e-16
# Record in-sample RMSE of the linear model for group 5. The rank-deficiency
# warning comes from the NA (collinear) p_prison30 coefficient above.
res_rmse[res_rmse$Group==5,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading
set.seed(480) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions vs the target, with the identity line.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==5,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(1123) # random forest reproducibility
mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)
# NOTE(review): randomForest's $predicted is out-of-bag -- confirm intent.
res_rmse[res_rmse$Group==5,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 21
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1428571"
res_rmse[res_rmse$Group==5,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
# FIX: also remove mdl_svm (was omitted here but removed in the group 1 cleanup).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
# Summary table: training/OOB RMSE per predictor group and model family.
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 1 | 0.5758467 | 0.5260647 | 0.5552924 | 0.5298834 |
| 2 | 0.5653701 | 0.5133255 | 0.5274903 | 0.5231372 |
| 3 | 0.5732612 | 0.5173124 | 0.5324007 | 0.5260381 |
| 4 | 0.5627913 | 0.5063120 | 0.5250373 | 0.5195994 |
| 5 | 0.5622628 | 0.4941209 | 0.5139356 | 0.5029710 |
We use the “Group 3” models but predict the raw score. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound.
### Create group 3 training data, this time predicting the raw score itself
### (not raw_score - f(age))
## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    raw_score)
## Format for xgboost: split predictors (X) from the label (y).
X = as.matrix(select(train, -raw_score))
y = as.matrix(select(train, raw_score))
train_xgb = xgb.DMatrix("data" = X, "label" = y)
set.seed(540) # reproducibility of the xgboost parameter search
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot: in-sample predictions of the raw score vs the actual score.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.511"
# Common range on both axes so the identity line sits at 45 degrees.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual), aes(x = compas, y = xgboost)) +
  geom_point(alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab("General score") +
  ylab("XGBoost prediction") +
  #annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(text = element_text(size = 14),
        axis.text = element_text(size = 14))
ggsave("Figures/raw_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
# Build the data for the ProPublica-style logistic regression: binary age
# bands, sex, prior counts, and a binarized COMPAS label.
propub = features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(age_low = if_else(p_current_age < 25,1,0),
    age_high = if_else(p_current_age > 45,1,0),
    female = if_else(sex=="Female",1,0),
    n_priors = p_felony_count_person + p_misdem_count_person,
    compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
    race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis
print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
# Logistic regression for P(COMPAS decile >= 5), mirroring ProPublica's model.
# race is already a factor releveled to Caucasian; as.factor() preserves that.
mdl_glm = glm(compas_high ~
  female +
  age_high +
  age_low +
  as.factor(race) +
  p_charge +
  is_misdem +
  recid,
  family=binomial(link='logit'), data=propub)
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_charge + is_misdem + recid, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7344 -0.7673 -0.3035 0.8381 2.6740
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.593179 0.082309 -19.356 < 2e-16 ***
## female 0.123516 0.085171 1.450 0.1470
## age_high -1.489311 0.129772 -11.476 < 2e-16 ***
## age_low 1.445909 0.071243 20.296 < 2e-16 ***
## as.factor(race)African-American 0.521072 0.072977 7.140 9.32e-13 ***
## as.factor(race)Asian -0.271728 0.503999 -0.539 0.5898
## as.factor(race)Hispanic -0.301324 0.130461 -2.310 0.0209 *
## as.factor(race)Native American 0.390718 0.678081 0.576 0.5645
## as.factor(race)Other -0.713647 0.159728 -4.468 7.90e-06 ***
## p_charge 0.155033 0.006521 23.773 < 2e-16 ***
## is_misdem -0.464124 0.069574 -6.671 2.54e-11 ***
## recid 0.491790 0.068811 7.147 8.87e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7907.2 on 5758 degrees of freedom
## Residual deviance: 5665.7 on 5747 degrees of freedom
## AIC: 5689.7
##
## Number of Fisher Scoring iterations: 5